home *** CD-ROM | disk | FTP | other *** search
/ PC World Komputer 2010 April / PCWorld0410.iso / hity wydania / Ubuntu 9.10 PL / karmelkowy-koliberek-desktop-9.10-i386-PL.iso / casper / filesystem.squashfs / usr / share / pyshared / drv_libxml2.py < prev    next >
Text File  |  2009-10-08  |  15KB  |  372 lines

  1. # -*- coding: iso-8859-1 -*-
  2. """ A SAX2 driver for libxml2, on top of its XmlReader API
  3.  
  4. USAGE
  5.     # put this file (drv_libxml2.py) in PYTHONPATH
  6.     import xml.sax
  7.     reader = xml.sax.make_parser(["drv_libxml2"])
  8.     # ...and the rest is standard python sax.
  9.  
  10. CAVEATS
  11.     - Lexical handlers are supported, except for start/endEntity
  12.       (waiting for XmlReader.ResolveEntity) and start/endDTD
  13.     - Error callbacks are not exactly synchronous, they tend
  14.       to be invoked before the corresponding content callback,
  15.       because the underlying reader interface parses
  16.       data by chunks of 512 bytes
  17.     
  18. TODO
  19.     - search for TODO
  20.     - some ErrorHandler events (warning)
  21.     - some ContentHandler events (setDocumentLocator, skippedEntity)
  22.     - EntityResolver (using libxml2.?)
  23.     - DTDHandler (if/when libxml2 exposes such node types)
  24.     - DeclHandler (if/when libxml2 exposes such node types)
  25.     - property_xml_string?
  26.     - feature_string_interning?
  27.     - Incremental parser
  28.     - additional performance tuning:
  29.       - one might cache callbacks to avoid some name lookups
  30.       - one might implement a smarter way to pass attributes to startElement
  31.         (some kind of lazy evaluation?)
  32.       - there might be room for improvement in start/endPrefixMapping
  33.       - other?
  34.  
  35. """
  36.  
  37. __author__  = u"StΘphane Bidoul <sbi@skynet.be>"
  38. __version__ = "0.3"
  39.  
  40. import codecs
  41. from types import StringType, UnicodeType
  42. StringTypes = (StringType,UnicodeType)
  43.  
  44. from xml.sax._exceptions import *
  45. from xml.sax import xmlreader, saxutils
  46. from xml.sax.handler import \
  47.      feature_namespaces, \
  48.      feature_namespace_prefixes, \
  49.      feature_string_interning, \
  50.      feature_validation, \
  51.      feature_external_ges, \
  52.      feature_external_pes, \
  53.      property_lexical_handler, \
  54.      property_declaration_handler, \
  55.      property_dom_node, \
  56.      property_xml_string
  57.  
  58. # libxml2 returns strings as UTF8
  59. _decoder = codecs.lookup("utf8")[1]
  60. def _d(s):
  61.     if s is None:
  62.         return s
  63.     else:
  64.         return _decoder(s)[0]
  65.  
  66. try:
  67.     import libxml2
  68. except ImportError, e:
  69.     raise SAXReaderNotAvailable("libxml2 not available: " \
  70.                                 "import error was: %s" % e)
  71.  
  72. class Locator(xmlreader.Locator):
  73.     """SAX Locator adapter for libxml2.xmlTextReaderLocator"""
  74.  
  75.     def __init__(self,locator):
  76.         self.__locator = locator
  77.  
  78.     def getColumnNumber(self):
  79.         "Return the column number where the current event ends."
  80.         return -1
  81.  
  82.     def getLineNumber(self):
  83.         "Return the line number where the current event ends."
  84.         return self.__locator.LineNumber()
  85.  
  86.     def getPublicId(self):
  87.         "Return the public identifier for the current event."
  88.         return None
  89.  
  90.     def getSystemId(self):
  91.         "Return the system identifier for the current event."
  92.         return self.__locator.BaseURI()
  93.  
  94. class LibXml2Reader(xmlreader.XMLReader):
  95.  
  96.     def __init__(self):
  97.         xmlreader.XMLReader.__init__(self)
  98.         # features
  99.         self.__ns = 0
  100.         self.__nspfx = 0
  101.         self.__validate = 0
  102.         self.__extparams = 1
  103.         # parsing flag
  104.         self.__parsing = 0
  105.         # additional handlers
  106.         self.__lex_handler = None
  107.         self.__decl_handler = None
  108.         # error messages accumulator
  109.         self.__errors = None
  110.  
  111.     def _errorHandler(self,arg,msg,severity,locator):
  112.         if self.__errors is None:
  113.             self.__errors = []
  114.         self.__errors.append((severity,
  115.                               SAXParseException(msg,None,
  116.                                                 Locator(locator))))
  117.  
  118.     def _reportErrors(self,fatal):
  119.         for severity,exception in self.__errors:
  120.             if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
  121.                             libxml2.PARSER_SEVERITY_WARNING):
  122.                 self._err_handler.warning(exception)
  123.             else:
  124.                 # when fatal is set, the parse will stop;
  125.                 # we consider that the last error reported
  126.                 # is the fatal one.
  127.                 if fatal and exception is self.__errors[-1][1]:
  128.                     self._err_handler.fatalError(exception)
  129.                 else:
  130.                     self._err_handler.error(exception)
  131.         self.__errors = None
  132.  
  133.     def parse(self, source):
  134.         self.__parsing = 1
  135.         try:
  136.             # prepare source and create reader
  137.             if type(source) in StringTypes:
  138.                 reader = libxml2.newTextReaderFilename(source)
  139.             else:
  140.                 source = saxutils.prepare_input_source(source)
  141.                 input = libxml2.inputBuffer(source.getByteStream())
  142.                 reader = input.newTextReader(source.getSystemId())
  143.             reader.SetErrorHandler(self._errorHandler,None)
  144.             # configure reader
  145.             if self.__extparams:
  146.                 reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
  147.                 reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
  148.                 reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
  149.                 reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
  150.             else:
  151.                 reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
  152.             # we reuse attribute maps (for a slight performance gain)
  153.             if self.__ns:
  154.                 attributesNSImpl = xmlreader.AttributesNSImpl({},{})
  155.             else:
  156.                 attributesImpl = xmlreader.AttributesImpl({})
  157.             # prefixes to pop (for endPrefixMapping)
  158.             prefixes = []
  159.             # start loop
  160.             self._cont_handler.startDocument()
  161.             while 1:
  162.                 r = reader.Read()
  163.                 # check for errors
  164.                 if r == 1:
  165.                     if not self.__errors is None:
  166.                         self._reportErrors(0)
  167.                 elif r == 0:
  168.                     if not self.__errors is None:
  169.                         self._reportErrors(0)
  170.                     break # end of parse
  171.                 else:
  172.                     if not self.__errors is None:
  173.                         self._reportErrors(1)
  174.                     else:
  175.                         self._err_handler.fatalError(\
  176.                             SAXException("Read failed (no details available)"))
  177.                     break # fatal parse error
  178.                 # get node type
  179.                 nodeType = reader.NodeType()
  180.                 # Element
  181.                 if nodeType == 1: 
  182.                     if self.__ns:
  183.                         eltName = (_d(reader.NamespaceUri()),\
  184.                                    _d(reader.LocalName()))
  185.                         eltQName = _d(reader.Name())
  186.                         attributesNSImpl._attrs = attrs = {}
  187.                         attributesNSImpl._qnames = qnames = {}
  188.                         newPrefixes = []
  189.                         while reader.MoveToNextAttribute():
  190.                             qname = _d(reader.Name())
  191.                             value = _d(reader.Value())
  192.                             if qname.startswith("xmlns"):
  193.                                 if len(qname) > 5:
  194.                                     newPrefix = qname[6:]
  195.                                 else:
  196.                                     newPrefix = None
  197.                                 newPrefixes.append(newPrefix)
  198.                                 self._cont_handler.startPrefixMapping(\
  199.                                     newPrefix,value)
  200.                                 if not self.__nspfx:
  201.                                     continue # don't report xmlns attribute
  202.                             attName = (_d(reader.NamespaceUri()),
  203.                                        _d(reader.LocalName()))
  204.                             qnames[attName] = qname
  205.                             attrs[attName] = value
  206.                         reader.MoveToElement()
  207.                         self._cont_handler.startElementNS( \
  208.                             eltName,eltQName,attributesNSImpl) 
  209.                         if reader.IsEmptyElement():
  210.                             self._cont_handler.endElementNS(eltName,eltQName)
  211.                             for newPrefix in newPrefixes:
  212.                                 self._cont_handler.endPrefixMapping(newPrefix)
  213.                         else:
  214.                             prefixes.append(newPrefixes)
  215.                     else:
  216.                         eltName = _d(reader.Name())
  217.                         attributesImpl._attrs = attrs = {}
  218.                         while reader.MoveToNextAttribute():
  219.                             attName = _d(reader.Name())
  220.                             attrs[attName] = _d(reader.Value())
  221.                         reader.MoveToElement()
  222.                         self._cont_handler.startElement( \
  223.                             eltName,attributesImpl)
  224.                         if reader.IsEmptyElement():
  225.                             self._cont_handler.endElement(eltName)
  226.                 # EndElement
  227.                 elif nodeType == 15: 
  228.                     if self.__ns:
  229.                         self._cont_handler.endElementNS( \
  230.                              (_d(reader.NamespaceUri()),_d(reader.LocalName())),
  231.                              _d(reader.Name()))
  232.                         for prefix in prefixes.pop():
  233.                             self._cont_handler.endPrefixMapping(prefix)
  234.                     else:
  235.                         self._cont_handler.endElement(_d(reader.Name()))
  236.                 # Text
  237.                 elif nodeType == 3: 
  238.                     self._cont_handler.characters(_d(reader.Value()))
  239.                 # Whitespace
  240.                 elif nodeType == 13: 
  241.                     self._cont_handler.ignorableWhitespace(_d(reader.Value()))
  242.                 # SignificantWhitespace
  243.                 elif nodeType == 14:
  244.                     self._cont_handler.characters(_d(reader.Value()))
  245.                 # CDATA
  246.                 elif nodeType == 4:
  247.                     if not self.__lex_handler is None:
  248.                         self.__lex_handler.startCDATA()
  249.                     self._cont_handler.characters(_d(reader.Value()))
  250.                     if not self.__lex_handler is None:
  251.                         self.__lex_handler.endCDATA()
  252.                 # EntityReference
  253.                 elif nodeType == 5:
  254.                     if not self.__lex_handler is None:
  255.                         self.startEntity(_d(reader.Name()))
  256.                     reader.ResolveEntity()
  257.                 # EndEntity
  258.                 elif nodeType == 16:
  259.                     if not self.__lex_handler is None:
  260.                         self.endEntity(_d(reader.Name()))
  261.                 # ProcessingInstruction
  262.                 elif nodeType == 7: 
  263.                     self._cont_handler.processingInstruction( \
  264.                         _d(reader.Name()),_d(reader.Value()))
  265.                 # Comment
  266.                 elif nodeType == 8:
  267.                     if not self.__lex_handler is None:
  268.                         self.__lex_handler.comment(_d(reader.Value()))
  269.                 # DocumentType
  270.                 elif nodeType == 10:
  271.                     #if not self.__lex_handler is None:
  272.                     #    self.__lex_handler.startDTD()
  273.                     pass # TODO (how to detect endDTD? on first non-dtd event?)
  274.                 # XmlDeclaration
  275.                 elif nodeType == 17:
  276.                     pass # TODO
  277.                 # Entity
  278.                 elif nodeType == 6:
  279.                     pass # TODO (entity decl)
  280.                 # Notation (decl)
  281.                 elif nodeType == 12:
  282.                     pass # TODO
  283.                 # Attribute (never in this loop)
  284.                 #elif nodeType == 2: 
  285.                 #    pass
  286.                 # Document (not exposed)
  287.                 #elif nodeType == 9: 
  288.                 #    pass
  289.                 # DocumentFragment (never returned by XmlReader)
  290.                 #elif nodeType == 11:
  291.                 #    pass
  292.                 # None
  293.                 #elif nodeType == 0:
  294.                 #    pass
  295.                 # -
  296.                 else:
  297.                     raise SAXException("Unexpected node type %d" % nodeType)
  298.             if r == 0:
  299.                 self._cont_handler.endDocument()
  300.             reader.Close()
  301.         finally:
  302.             self.__parsing = 0
  303.  
  304.     def setDTDHandler(self, handler):
  305.         # TODO (when supported, the inherited method works just fine)
  306.         raise SAXNotSupportedException("DTDHandler not supported")
  307.  
  308.     def setEntityResolver(self, resolver):
  309.         # TODO (when supported, the inherited method works just fine)
  310.         raise SAXNotSupportedException("EntityResolver not supported")
  311.  
  312.     def getFeature(self, name):
  313.         if name == feature_namespaces:
  314.             return self.__ns
  315.         elif name == feature_namespace_prefixes:
  316.             return self.__nspfx
  317.         elif name == feature_validation:
  318.             return self.__validate
  319.         elif name == feature_external_ges:
  320.             return 1 # TODO (does that relate to PARSER_LOADDTD)?
  321.         elif name == feature_external_pes:
  322.             return self.__extparams
  323.         else:
  324.             raise SAXNotRecognizedException("Feature '%s' not recognized" % \
  325.                                             name)
  326.  
  327.     def setFeature(self, name, state):
  328.         if self.__parsing:
  329.             raise SAXNotSupportedException("Cannot set feature %s " \
  330.                                            "while parsing" % name)
  331.         if name == feature_namespaces:
  332.             self.__ns = state
  333.         elif name == feature_namespace_prefixes:
  334.             self.__nspfx = state
  335.         elif name == feature_validation:
  336.             self.__validate = state
  337.         elif name == feature_external_ges:
  338.             if state == 0:
  339.                 # TODO (does that relate to PARSER_LOADDTD)?
  340.                 raise SAXNotSupportedException("Feature '%s' not supported" % \
  341.                                                name)
  342.         elif name == feature_external_pes:
  343.             self.__extparams = state
  344.         else:
  345.             raise SAXNotRecognizedException("Feature '%s' not recognized" % \
  346.                                             name)
  347.  
  348.     def getProperty(self, name):
  349.         if name == property_lexical_handler:
  350.             return self.__lex_handler
  351.         elif name == property_declaration_handler:
  352.             return self.__decl_handler
  353.         else:
  354.             raise SAXNotRecognizedException("Property '%s' not recognized" % \
  355.                                             name)
  356.  
  357.     def setProperty(self, name, value):     
  358.         if name == property_lexical_handler:
  359.             self.__lex_handler = value
  360.         elif name == property_declaration_handler:
  361.             # TODO: remove if/when libxml2 supports dtd events
  362.             raise SAXNotSupportedException("Property '%s' not supported" % \
  363.                                            name)
  364.             self.__decl_handler = value
  365.         else:
  366.             raise SAXNotRecognizedException("Property '%s' not recognized" % \
  367.                                             name)
  368.  
  369. def create_parser():
  370.     return LibXml2Reader()
  371.  
  372.